rm(list=ls())
setwd("YOUR WORKING DIRECTORY HERE")

library(foreign)
library(ggplot2)
library(countrycode)
library(tidyr)
library(readstata13)
library(stargazer)
library(lme4)
library(sjPlot)
library(dplyr)
library(readxl)
library(peacesciencer)

#################
#data on state support for religion & discrimination against minorities 
#need to transform into country-year format
#################
#https://www.thearda.com/data-archive?fid=RAS3COMP&tab=3
#The Religion and State Project, Main Dataset and Societal Module, Round 3
dfs <- read_xlsx("RAS_V3.xlsx")
names(dfs) <- tolower(names(dfs))

gathered.df <- gather(dfs, key, value, -country, -region)

#create year variable
gathered.df$year <- NA
RIGHT = function(x,n){
  substring(x,nchar(x)-n+1)
}
gathered.df$year <- RIGHT(gathered.df$key,4)

gathered.df <- subset(gathered.df, year=="1990" | year=="1991" | year=="1992" | year=="1993" | year=="1994" | year=="1995" | year=="1996" | year=="1997" | year=="1998" | year=="1999" | year=="2000" | year=="2001" | year=="2002" | year=="2003" | year=="2004" | year=="2005" | year=="2006" | year=="2007" | year=="2008" | year=="2009" | year=="2010" | year=="2011" | year=="2012" | year=="2013" | year=="2014")

#create variable variable
gathered.df$length <- nchar(gathered.df$key, type = "chars", allowNA = FALSE, keepNA = NA)

gathered.df$RSV <- NA

gathered.df <- subset(gathered.df, length<17)

gathered.df <- gathered.df %>% 
  mutate(RSV = case_when(gathered.df$length==16 ~ substr(gathered.df$key, 1, 12), # both tests: group A
                         gathered.df$length==15 ~ substr(gathered.df$key, 1, 11), # one test: group 
                         gathered.df$length==14 ~ substr(gathered.df$key, 1, 10), # one test: group 
                         gathered.df$length==13 ~ substr(gathered.df$key, 1, 9), # one test: group 
                         gathered.df$length==12 ~ substr(gathered.df$key, 1, 8), # one test: group 
                         gathered.df$length==11 ~ substr(gathered.df$key, 1, 7), # one test: group 
                         gathered.df$length==10 ~ substr(gathered.df$key, 1, 6), # one test: group 
                         gathered.df$length==9 ~ substr(gathered.df$key, 1, 5), # one test: group 
                         gathered.df$length==8 ~ substr(gathered.df$key, 1, 4)
  ))


table(gathered.df$RSV)

rsd2 <- dplyr::select(gathered.df, -c(key, length))

rsd2 <- rsd2 %>% distinct(country, year, RSV, .keep_all= TRUE) 

data <- spread(rsd2, RSV, value)

rsd2 <- data

rsd2$year <- as.numeric(rsd2$year)
rsd2$socdisgen <- as.numeric(as.character(rsd2$wsocdisx))

rsd2$iso3n <- countrycode(rsd2$country, "country.name", "iso3n")

#create general discrimination variable by adding all individual mx variables
rsd2 <- rsd2 %>% mutate_if(is.character,as.numeric)
rsd2$mxsum <- NA
rsd2$mxsum <- rsd2$mx01x+rsd2$mx02x+rsd2$mx03x+rsd2$mx04x+rsd2$mx05x+rsd2$mx06x+rsd2$mx07x+rsd2$mx08x+rsd2$mx09x+rsd2$mx10x+rsd2$mx11x+rsd2$mx12x+rsd2$mx13x+rsd2$mx14x+rsd2$mx15x+rsd2$mx16x+rsd2$mx17x+rsd2$mx18x+rsd2$mx19x+rsd2$mx20x+rsd2$mx21x+rsd2$mx22x+rsd2$mx23x+rsd2$mx24x+rsd2$mx25x+rsd2$mx26x+rsd2$mx27x +rsd2$mx28x+rsd2$mx29x+rsd2$mx30x+rsd2$mx31x+rsd2$mx32x+rsd2$mx33x+rsd2$mx34x+rsd2$mx35x + rsd2$mx36x


#create lx sum
rsd2$lxx <- NA
rsd2$lxx <- as.numeric(rsd2$lxx)
rsd2$lxx<- rsd2$lx01x+rsd2$lx02x+rsd2$lx03x+rsd2$lx04x+rsd2$lx05x+rsd2$lx06x+rsd2$lx07x+rsd2$lx08x+rsd2$lx09x+rsd2$lx10x+rsd2$lx11x+rsd2$lx12x+rsd2$lx13x+rsd2$lx14x+rsd2$lx15x+rsd2$lx16x+rsd2$lx17x+rsd2$lx18x+rsd2$lx19x+rsd2$lx20x+rsd2$lx21x+rsd2$lx22x+rsd2$lx23x+rsd2$lx24x+rsd2$lx25x+rsd2$lx26x+rsd2$lx27x+rsd2$lx28x+rsd2$lx29x+rsd2$lx30x+rsd2$lx31x+rsd2$lx32x+rsd2$lx33x+rsd2$lx34x+rsd2$lx35x+rsd2$lx36x+rsd2$lx37x+rsd2$lx38x+rsd2$lx39x+rsd2$lx40x+rsd2$lx41x+rsd2$lx42x+rsd2$lx43x+rsd2$lx44x+rsd2$lx45x+rsd2$lx46x+rsd2$lx47x+rsd2$lx48x+rsd2$lx49x+rsd2$lx50x+rsd2$lx51x+rsd2$lx52x

#create lx categories
rsd2$lxrelationships <- NA
rsd2$lxrelationships <- rsd2$lx01x+rsd2$lx02x+rsd2$lx03x+rsd2$lx04x+rsd2$lx05x+rsd2$lx06x+rsd2$lx07x

rsd2$lxwomenrestrictions <- NA
rsd2$lxwomenrestrictions <- rsd2$lx08x+rsd2$lx09x+rsd2$lx10x+rsd2$lx11x

rsd2$lxenforcereligion <- NA
rsd2$lxenforcereligion <- rsd2$lx22x+rsd2$lx23x+rsd2$lx24x+rsd2$lx25x+rsd2$lx26x

rsd2$lxfundingreligion <- NA
rsd2$lxfundingreligion <- rsd2$lx27x+rsd2$lx28x+rsd2$lx29x+rsd2$lx30x+rsd2$lx31x+rsd2$lx32x+rsd2$lx33x+rsd2$lx34x+rsd2$lx35x+rsd2$lx36x+rsd2$lx37x

rsd2$lxentanglement <- NA
rsd2$lxentanglement <- rsd2$lx38x+rsd2$lx39x+rsd2$lx40x+rsd2$lx41x+rsd2$lx42x+rsd2$lx43x

rsd2$country <- countrycode(rsd2$iso3n, "iso3n", "country.name")
rsd2$country <- as.character(rsd2$country)
rsd2$country[rsd2$country=="Timor"] <- "East Timor"
rsd2$ISO3 <- NA
rsd2$ISO3 <- countrycode(rsd2$country, 'country.name', 'iso3c')
rsd2 <- distinct(rsd2, ISO3, year, .keep_all=TRUE)

rsd2 <- select(rsd2, region, ISO3, year, lxentanglement, lxfundingreligion, lxenforcereligion, lxwomenrestrictions, lxrelationships, lxx, mxsum, socdisgen)

##########################
####get country level controls
##########################
#get polity data
#using package peacesciencer
#can also use for gdp and population but missing germany 

controls <- create_stateyears(subset_years = c(1990:2014)) %>%
  add_democracy()

controls$ISO3 <-countrycode(controls$`statenme`, 'country.name', 'iso3c')
#drop duplicates
controls <- controls %>% distinct(ISO3, year, .keep_all = TRUE)

df <- left_join(rsd2, controls, by=c("ISO3", "year"))

#population
#https://data.worldbank.org/indicator/SP.POP.TOTL

pop <- read.csv("worldbankpopulation.csv")
pop <- dplyr::select(pop, -c("Country.Code", "Indicator.Name", "Indicator.Code"))
data_long <- gather(pop, year, population, -`Country.Name`)
pop <- data_long
pop$ISO3 <- countrycode(pop$`Country.Name`, 'country.name', 'iso3c')
#remove x's in front of years
pop$year <- substring(pop$year, 2)
pop$year <- as.numeric(pop$year)
#drop duplicates
pop <- pop %>% distinct(ISO3, year, .keep_all = TRUE)

df <- left_join(df, pop, by=c("ISO3", "year"))

#gdppc
#https://data.worldbank.org/indicator/NY.GDP.PCAP.CD
gdp <- read.csv("gdppcworldbank.csv")
gdp <- dplyr::select(gdp, -c("Country.Code", "Indicator.Name", "Indicator.Code"))
data_long <- gather(gdp, year, gdppc, -`Country.Name`)
gdp <- data_long
gdp$ISO3 <- countrycode(gdp$`Country.Name`, 'country.name', 'iso3c')
#remove x's in front of years
gdp$year <- substring(gdp$year, 2)
gdp$year <- as.numeric(gdp$year)
#drop duplicates
gdp <- gdp %>% distinct(ISO3, year, .keep_all = TRUE)

df <- left_join(df, gdp, by=c("ISO3", "year"))

#gini
#https://data.worldbank.org/indicator/SI.POV.GINI
library(readxl)
gini <- read_xlsx("gini.xlsx")
data_long <- gather(gini, year, gini, -`Country Name`)
gini <- data_long
gini$ISO3 <- countrycode(gini$`Country Name`, 'country.name', 'iso3c')
gini$year <- as.numeric(gini$year)
gini <- gini %>% 
  dplyr::group_by(ISO3) %>%
  fill(gini, .direction="downup")

#drop duplicates
gini <- gini %>% distinct(ISO3, year, .keep_all = TRUE)

df <- left_join(df, gini, by=c("ISO3", "year"))

#get data on durability (hand coded by Jonathan Fox)
#get data on majority religion (hand coded by Jonathan Fox)
library(readstata13)
controls <- read.dta13("RAS-Minorities-long-1990-2014.dta")
#drop duplicates, data has separate observation for each minority in each country-not needed for country level)
controls <- controls %>% distinct(country, year, .keep_all = TRUE)
controls$ISO3 <- countrycode(controls$country, 'country.name', 'iso3c')
controls <- select(controls, zdurable, emajrel, ISO3, year)

#drop na prior to merge
controls <- subset(controls, !is.na(ISO3) & !is.na(year))
controls <- subset(controls, !is.na(zdurable))

#merge on country year
rsd <- left_join(df, controls, by=c("ISO3", "year"))


##########################
#####get WVS data
##########################
#https://www.worldvaluessurvey.org/WVSDocumentationWVL.jsp
#Time series 1981-2022
wvs <- read.dta13("WVS_TimeSeries_1981_2022_Stata_v3_0.dta")

#prep data for merging
wvs$F025 <- as.character(wvs$F025)

df <- wvs
#create country indicator
df$country <- NA
df$country <- sub("^([[:alpha:]]*).*", "\\1", df$S025)
df$country[df$country=="United"] <- "United States"
df$country[df$country=="Czech"] <- "Czech Republic"
df$country[df$country=="El"] <- "El Salvador"
df$country[df$country=="Viet"] <- "Vietnam"
df$country[df$country=="Saudi"] <- "Saudi Arabia"
df$country[df$country=="Great"] <- "Great Britain"
df$country[df$country=="New"] <- "New Zealand"
df$country[df$country=="South"] <- "South Sudan"
df$country[df$country=="Dominican"] <- "Dominican Republic"

#create country code
library(countrycode)
df$ISO3 <- NA
df$ISO3 <- countrycode(df$country, 'country.name', 'iso3c')

#create year indicator
library(stringr)
df$year<- NA
numextract <- function(string){ 
  str_extract(string, "\\-*\\d+\\.*\\d*")
}
df$year<- numextract(df$S025)
df$year <- as.numeric(df$year)

df$group <- df$F025

wvs <- df
#merge on country year
#
#
#
#
#
df <- left_join(wvs, rsd, by=c("ISO3", "year"))

df$country <- countrycode(df$ISO3, 'iso3c', 'country.name')

#drop obs with missing data
df <- subset(df, group!="No answer/refused")
df <- subset(df, group!="Not asked")
df <- subset(df, group!="Don't know")
df <- subset(df, group!="Other missing; Multiple answers Mail (EVS)")

#identify majority members by matching emajrel to wvs group
df$maj <- 0
df$maj[df$emajrel=="Catholic" & df$group=="Roman Catholic"] <- 1
df$maj[df$emajrel=="Orthodox Christian" & df$group=="Orthodox (Russian/Greek/etc.)"] <- 1
df$maj[df$emajrel=="Protestant Christian" & df$group=="Protestant"] <- 1
df$maj[df$emajrel=="Christian (general)" & (df$group=="Other Christian (Evangelical/Pentecostal/Fee church/etc.)" | df$group=="Protestant" | df$group=="Roman Catholic" | df$group=="Orthodox (Russian/Greek/etc.)")] <- 1

#categorical 
df$relcat <- 0
df$relcat[df$group=="Do not belong to a denomination"] <- 1
df$relcat[df$maj==1] <- 2

#religion variables
#religion variables
df$relig_import <- NA
df$relig_import[df$A006=="Not at all important"] <- 0
df$relig_import[df$A006=="Not very important"] <- 1
df$relig_import[df$A006=="Rather important"] <- 2
df$relig_import[df$A006=="Very important"] <- 3

df$service_freq <- NA
df$service_freq[df$F028=="Never practically never"] <- 0
df$service_freq[df$F028=="Less often"] <- 1
df$service_freq[df$F028=="Once a year"] <- 2
df$service_freq[df$F028=="Only on special holy days/Christmas/Easter days" | df$F028=="Other specific holy days"] <- 3
df$service_freq[df$F028=="Once a month"] <- 4
df$service_freq[df$F028=="Once a week"] <- 5
df$service_freq[df$F028=="More than once a week"] <- 6

#well being-happiness
df$happiness <-NA
df$happiness[df$A008=="Not at all happy"] <- 0
df$happiness[df$A008=="Not very happy"] <- 1
df$happiness[df$A008=="Quite happy"] <- 2
df$happiness[df$A008=="Very happy"] <- 3

df$wellbeing <- NA
df$wellbeing[df$A170=="Dissatisfied"] <- 0
df$wellbeing[df$A170=="2"] <- 1
df$wellbeing[df$A170=="3"] <- 2
df$wellbeing[df$A170=="4"] <- 3
df$wellbeing[df$A170=="5"] <- 4
df$wellbeing[df$A170=="6"] <- 5
df$wellbeing[df$A170=="7"] <- 6
df$wellbeing[df$A170=="8"] <- 7
df$wellbeing[df$A170=="9"] <- 8
df$wellbeing[df$A170=="Satisfied"] <- 9

df$female <- NA
df$female[df$X001=="Male"] <- 0
df$female[df$X001=="Female"] <- 1

df$yearborn <- as.numeric(as.character(df$X002))
df$yearsurvey <- as.numeric(as.character(df$S020))
df$age <- df$yearsurvey-df$yearborn
df$age[df$age<16] <- NA

df$education <- as.numeric(df$X025)
df$education[df$X025=="Missing; Unknown" | df$X025=="Not asked in survey" | df$X025=="No answer" | df$X025=="Don´t know"] <- NA
df$education[df$education==3] <- 5
df$education <- df$education-5

df$minority <- NA
df$minority[df$relcat==0] <- 1
df$minority[df$relcat==1 | df$relcat==2] <- 0

#confidence in government
table(df$E069_11)
df$govconf <- NA
df$govconf[df$E069_11=="None at all"] <- 0
df$govconf[df$E069_11=="Not very much"] <- 1
df$govconf[df$E069_11=="Quite a lot"] <- 2
df$govconf[df$E069_11=="A great deal"] <- 3

#confidence in parliament
table(df$E069_07)
df$parliamentconf <- NA
df$parliamentconf[df$E069_07=="None at all"] <- 0
df$parliamentconf[df$E069_07=="Not very much"] <- 1
df$parliamentconf[df$E069_07=="Quite a lot"] <- 2
df$parliamentconf[df$E069_07=="A great deal"] <- 3


#drop unneeded variables
df <- dplyr::select(df, year, ISO3, region, emajrel, S007, S002VS, lxx, govconf, parliamentconf, minority, relig_import, education, wellbeing, gini, mxsum, population, gdppc, polity2, zdurable, country, relcat, female, age, socdisgen, service_freq, lxentanglement, lxrelationships, lxfundingreligion, lxenforcereligion)

#christian majority countries only
df <- subset(df, emajrel=="Catholic" | emajrel=="Orthodox Christian" | emajrel=="Protestant Christian" | emajrel=="Christian (general)")

write.csv(df, "apsr_religion_legitimacy.csv")
